Setup

Load packages

library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA),
  text = element_text(size = 20),
  strip.placement = "outside",
  strip.text = element_text(size = 20, color = "black"),
  strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(GenomicRanges)
library(openxlsx)
require(ggraph)
require(igraph)
source("scripts/functions.R")
source("../motif-analysis/mta_downstream_functions.R")

Data directories and metadata

atac_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "results"
)
cns_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "consensus_peaks"
)
bnd_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "footprint", "BINDetect"
)
rna_dir <- file.path(
  "RNASEQ_QUANTIFICATION", "results"
)
pub_dir <- file.path("published")
res_dir <- file.path("results")
fig_dir <- file.path("plots")
for (newdir in c(res_dir, fig_dir))
  dir.create(newdir, showWarnings = FALSE)

# metadata
des_fn <- file.path("RNASEQ_QUANTIFICATION", "design_table.tsv")
des_dt <- fread(des_fn)
setnames(des_dt, "reporter_line", "reporterline")
col_dt <- des_dt[, .(sample, reporterline, condition)]
col_dt[, reporterline := factor(
  reporterline,
  levels = c("Elav", "Fox", "Ncol")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
setorder(col_dt, reporterline, condition)

# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
  "Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
  "Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)

Gene annotations and lists

marks_gold <- fread(
  file.path("annotation", "golden-marks-231124.tsv"),
  fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)

tfs_annt <- fread(
  file.path("annotation", "tfs.Nvec.tsv"),
  header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))

gene_annt <- fread(
  file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
  select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))

gene_ann <- rbindlist(list(
  gene_annt[!gene %in% tfs_annt$gene],
  tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]

Load normalized gene expression (RNA-seq) and motif accessibility (ATAC-seq) data

# GENES

# size norm gene expression
exps_mt <- read.table(
  file.path(rna_dir, "mat_norm.tsv")
)

# size and gene norm gene expression
expr_mt <- read.table(
  file.path(rna_dir, "norm_expression.tsv")
)

# gene expression TPMs
tpms_mt <- read.table(
  file.path(rna_dir, "tpm_expression.tsv")
)

# gene clusters
gene_clust <- fread(
  file.path(rna_dir, "genes_clusters.tsv")
)

# PEAKS

# peak accessibility
accs_mt <- read.table(
  file.path(atac_dir, "mat_norm.tsv")
)

# peak to gene assignment
assign <- fread(
  file.path(atac_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)

# peak clusters
pks_clust <- fread(
  file.path(atac_dir, "consensusSeekeR-peaks-clusters.bed")
)
bed_cols <- c(
  "seqnames", "start", "end", "peak", "score", "strand", "cluster", "group"
)
setnames(pks_clust, bed_cols)

# MOTIFS

# motif enrichment scores
motf_dt <- fread(
  file.path(atac_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(motf_dt, "label", "group")

# motif enrichment qvalue
motf_qv <- read.table(
  file.path(atac_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
  sep = "\t"
)

# motif enrichment log2fc
motf_fc <- read.table(
  file.path(atac_dir, "motif-enrich-archetypes-mona-fc.tsv"),
  sep = "\t"
)

# motif to tf assignment
dc <- fread(file.path(
  "annotation",
  "assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]

Correlation between motif binding and gene expression

Load fold changes for expression and motif binding

# binding comparisons fold changes for best correlated motifs
dp_dt <- fread(file.path(
  bnd_dir, "bindetect_results_reporterline.txt"
))
dp_dt[, id := paste(reporterline, vs, sep = "_vs_")]
dp_dt[, motif_gene := paste(motif, gene, sep = "__")]
dt_atac <- dcast.data.table(
  dp_dt, motif_gene ~ id, value.var = "log2FoldChange"
)
mt_atac <- as.matrix(dt_atac[, -1])
rownames(mt_atac) <- dt_atac$motif_gene

# RNA comparisons fold changes
mt_rna <- read.table(
  file.path(rna_dir, "log2fc_expression.tsv"),
  sep = "\t",
  header = TRUE
)

Dot plot of expression and binding score

dt_rna_plot <- melt.data.table(
  as.data.table(mt_rna, keep.rownames = "gene"),
  id.vars = "gene", variable.name = "comparison", value.name = "rna_fc"
)
dt_atac_plot <- melt.data.table(
  as.data.table(mt_atac, keep.rownames = "motif_gene"),
  id.vars = "motif_gene", variable.name = "comparison", value.name = "atac_fc"
)
dt_atac_plot[, gene := str_remove(motif_gene, "ARCH\\d+__")]
dt_atac_plot[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_comb <- merge.data.table(
  dt_rna_plot, dt_atac_plot,
  by = c("gene", "comparison")
)

# filtering
ids_rna <- rownames(mt_rna)[apply(mt_rna, 1, function(x) max(x) > 1.5)]
ids_atac <- rownames(mt_atac)[apply(mt_atac, 1, function(x) max(x) > 0.1)]
dt_plot <- dt_comb[gene %in% ids_rna & motif_gene %in% ids_atac]

# add gene annotation
dt_plot <- merge.data.table(
  dt_plot, gene_ann, by = "gene",
  all.x = TRUE, sort = FALSE
)

# label for plotting
dt_plot[, label := paste(
  substr(name, 1, 30), gene, motif,
  sep = " | "
)]

# clustering
tfs <- unique(dt_plot$gene)
hc <- hclust(dist(mt_rna[tfs, ]), method = "ward.D2")
ord <- tfs[hc$order]

# ordering
tfs <- unique(dt_plot$gene)
mord <- order(apply(mt_rna[tfs,], 1, which.max))
ord <- tfs[mord]

# order genes
dt_plot[, gene := factor(gene, levels = rev(ord))]
setorder(dt_plot, gene)
dt_plot[, label := factor(label, levels = unique(dt_plot$label))]
setorder(dt_plot, label)

# limits for plotting
dt_plot[rna_fc < 0, rna_fc := 0]
dt_plot[rna_fc > 8, rna_fc := 8]

# plot
require(ggplot2)
require(scales)
gp_dot <- ggplot(dt_plot, aes(comparison, label)) +
  geom_point(
    aes(size = rna_fc, fill = atac_fc),
    shape = 21,
    color = "black"
  ) +
  scale_fill_gradientn(
    colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
    # limits = c(-5, 5), oob = squish, breaks = c(-5, 0, 5),
    name = "motif binding\nlog2(fold change)",
  ) +
  scale_size_continuous(
    # range = c(1, 7),
    # breaks = c(0, 2, 4, 6),
    name = "gene expression\nlog2(fold change)"
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
    legend.direction = "vertical"
  )
gp_dot

Select correlated genes

mt_bnd <- copy(mt_atac)
rownames(mt_bnd) <- str_remove(rownames(mt_bnd),  "ARCH\\d+__")
mt_exp <- mt_rna[rownames(mt_bnd), colnames(mt_bnd)]
rownames(mt_bnd) <- rownames(mt_atac)
rownames(mt_exp) <- rownames(mt_atac)

# correlation between same comparisons (columns)
mt_cors_smp <- cor(mt_bnd, mt_exp, method = "spearman")
diag(mt_cors_smp)
##  Elav_vs_Fox Elav_vs_Ncol  Elav_vs_neg  Fox_vs_Elav  Fox_vs_Ncol   Fox_vs_neg 
##    0.1808882    0.1054204    0.3131112    0.1808882    0.1976934    0.2128027 
## Ncol_vs_Elav  Ncol_vs_Fox  Ncol_vs_neg 
##    0.1054204    0.1976934    0.1407049
# correlation between genes (rows)
mt_cors_gen <- cor(t(mt_bnd), t(mt_exp), method = "spearman")
cors_gen <- diag(mt_cors_gen)
names(cors_gen) <- rownames(mt_cors_gen)
cors_gen <- sort(cors_gen, decreasing = TRUE)

# select correlated genes
select_pos <- names(cors_gen[(cors_gen) > 0.5])

# correlation distribution
dt_cors <- data.table(cor = cors_gen)
dt_cors$motif_gene <- names(cors_gen)
dt_cors[, correlation := "no"]
dt_cors[motif_gene %in% select_pos, correlation := "positive"]
dt_cors[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_cors[, gene := str_remove(motif_gene, "ARCH\\d+__")]

# plot
gp_hist <- ggplot(dt_cors, aes(cor, fill = correlation)) +
  geom_histogram(color = "black", binwidth = 0.05) +
  scale_y_continuous(expand = expansion(c(0, 0.1))) +
  scale_fill_manual(values = c(
    "positive" = "green",
    "no" = "grey"
  )) +
  labs(x = "correlation", y = "number of genes") +
  theme(legend.position = "none")
gp_hist

Inspect correlations between gene expression and motif binding score per sample

mt_ord <- cbind(mt_exp, mt_bnd)
colnames(mt_ord) <- paste(
  colnames(mt_ord),
  rep(c("RNA", "ATAC"), each = 9),
  sep = "_"
)
mt_dt <- as.data.table(mt_ord, keep.rownames = "motif_gene")
mt_dt[, motif := str_extract(motif_gene, "ARCH\\d+")]
mt_dt[, gene := str_remove(motif_gene, "ARCH\\d+__")]
mt_dt[, motif_gene := NULL]
dt_atac <- melt.data.table(
  mt_dt,
  id.vars = c("gene", "motif"),
  measure.vars = grep("ATAC", colnames(mt_dt), value = TRUE),
  value.name = "ATAC", variable.name = "comparison"
)
dt_atac[, comparison := str_remove(comparison, "_ATAC")]

dt_rna <- melt.data.table(
  mt_dt,
  id.vars = "gene",
  measure.vars = grep("RNA", colnames(mt_dt), value = TRUE),
  value.name = "RNA", variable.name = "comparison"
)
dt_rna[, comparison := str_remove(comparison, "_RNA")]

dt_comp <- merge.data.table(
  dt_rna, dt_atac,
  by = c("gene", "comparison"), all = TRUE
)
setcolorder(dt_comp, c("motif", "gene", "ATAC", "RNA"))
dt_comp[, reporterline := str_extract(comparison, "Elav|Fox|Ncol")]
dt_comp[, vs := str_extract(comparison, "vs.*")]
dt_comp[, vs := str_remove(vs, "vs_")]
dt_comp <- merge.data.table(
  dt_comp, dt_cors, by = c("motif", "gene"),
  all.x = TRUE, sort = FALSE
)

# add gene annotation
dt_comp <- merge.data.table(
  dt_comp, gene_ann, by = "gene",
  all.x = TRUE, sort = FALSE
)

# save
fwrite(
  dt_comp,
  file.path(res_dir, "correlation_expression_binding.tsv"),
  sep = "\t"
)

# label for plotting
dt_comp[, label := substr(name, 1, 20)]

# posterboys to label
dt_lab <- dt_comp[cor > 0.5 & abs(RNA) > 1 & abs(ATAC) > 0.1]

# plot
gp_comp <- ggplot(dt_comp, aes(RNA, ATAC, color = correlation)) +
  geom_point(data = dt_comp[correlation == "no"], alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(
    data = dt_lab[ATAC > 0],
    aes(label = label),
    color = "black",
    size = 3,
    segment.color = "grey50",
    min.segment.length = 0,
    nudge_y = 0.5 - dt_lab[ATAC > 0]$ATAC,
  ) +
  geom_text_repel(
    data = dt_lab[ATAC < 0],
    aes(label = label),
    color = "black",
    size = 3,
    segment.color = "grey50",
    min.segment.length = 0,
    nudge_y = -0.5 - dt_lab[ATAC < 0]$ATAC,
  ) +
  geom_point(data = dt_comp[correlation != "no"], alpha = 0.8) +
  scale_color_manual(values = c(
    "positive" = "green",
    "no" = "grey"
  )) +
  scale_y_continuous(expand = c(0.1, 0.1)) +
  facet_grid(vs ~ reporterline) +
  theme(legend.position = "bottom")
gp_comp
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Inspect correlation of binding and expression per gene

# binding correlations with expression
dt_comp <- fread(
  file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp_cor <- unique(
  dt_comp[correlation != "no"][, .(motif, gene, reporterline)]
)

# for individual genes
comp_marks <- dt_comp[gene %in% marks_gold$gene]
comp_marks_gp <- ggplot(comp_marks, aes(ATAC, RNA, color = reporterline, shape = vs)) +
  scale_color_manual(values = line_cols) +
  scale_shape_manual(values = c("Elav" = 15, "Fox" = 16, "Ncol" = 17, "neg" = 18)) +
  facet_wrap("name") +
  geom_hline(yintercept = 0, linetype = 2, size = 0.5) +
  geom_vline(xintercept = 0, linetype = 2, size = 0.5) +
  geom_point(size = 2) +
  ggpubr::stat_cor(method = "pearson", label.x = -0.5, label.y = 10, color = "black") +
  theme(legend.position = "bottom") +
  labs(x = "motif binding logFC", y = "expression logFC")
comp_marks_gp
## Warning: Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Caused by error in `cor.test.default()`:
## ! not enough finite observations

Gene networks

Load baseline networks from BINDetect bound targets that are also expressed in given sample, and subset for those TFs for which binding and expression are correlated.

# binding correlations with expression
dt_comp <- fread(
  file.path(res_dir, "correlation_expression_binding.tsv")
)

# binding targets
mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
mo_dt <- mo_dt[expressed == TRUE & target_expressed == TRUE]
mo_dt <- mo_dt[grep("pos", sample)]
mo_dt <- mo_dt[expression_lfc > 0]
mo_dt[, reporterline := str_remove(sample, "_pos")]

# add TF-target gene expression correlation
g_cors_dt <- unique(mo_dt[, .(gene, target_gene)])
g_cors_dt <- g_cors_dt[gene %in% rownames(tpms_mt)]
g_cors_dt <- g_cors_dt[target_gene %in% rownames(tpms_mt)]
g_cors_dt[, expression_correlation := cor(
  unlist(tpms_mt[gene, ]),
  unlist(tpms_mt[target_gene, ])
), by = 1:nrow(g_cors_dt)]
mo_dt <- merge.data.table(
  mo_dt, g_cors_dt, by = c("gene", "target_gene"),
  all.x = TRUE, sort = FALSE
)
mo_dt <- mo_dt[!is.na(expression_correlation)]
mo_dt[, c("expressed", "target_expressed") := NULL]

# save
fwrite(
  mo_dt,
  file.path(res_dir, "network.tsv.gz"),
  sep = "\t"
)

# select correlated motifs
mo_dt_cor <- merge.data.table(
  dt_comp_cor, mo_dt, by = c("motif", "gene", "reporterline"), sort = FALSE
)

# save
fwrite(
  mo_dt_cor,
  file.path(res_dir, "network_correlation.tsv.gz"),
  sep = "\t"
)

Filtering by expression

mo_dt <- fread(file.path(res_dir, "network.tsv.gz"))
mo_dt[, reporterline := NULL]
mo_dt[, target_TF := ifelse(target_gene %in% .SD$gene, TRUE, FALSE), sample]
mo_dt[, name := substr(name, 1, 30)]
mo_dt[, pfam := substr(pfam, 1, 30)]
mo_dt[, target_name := substr(target_name, 1, 30)]
mo_dt[, target_pfam := substr(target_pfam, 1, 30)]
mo_dt <- mo_dt[expression_lfc > 1.5][target_expression_lfc > 1]
setcolorder(mo_dt, c(
  "sample", "gene", "name", "pfam",
  "expression_tpm", "expression_lfc",
  "motif", "motif_name",
  "target_gene", "target_name", "target_pfam",
  "target_expression_tpm", "target_expression_lfc", 
  "expression_correlation", "n_samples"
))

How many target genes for each TF?

hits_dt <- mo_dt[, .N, .(gene, motif, sample)]
hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
  geom_boxplot(outlier.color = NA) +
  ggbeeswarm::geom_beeswarm(shape = 21) +
  scale_y_continuous(trans = "log10") +
  scale_fill_manual(values = line_cond_cols) +
  labs(y = "target genes per TF") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    panel.grid.major.y = element_line(linewidth = 0.5),
    panel.grid.minor.y = element_line(linewidth = 0.2)
  ) +
  geom_text(
    data = hits_dt[,.N,sample],
    aes(label = N, y = 15)
  )
hits_gp

Save network files

require(openxlsx)
wb <- createWorkbook()
for (smp in c("Elav", "Ncol", "Fox")) {
  smp_fn <- file.path(res_dir, sprintf("network_%s.tsv.gz", smp))
  smp_dt <- mo_dt[sample == paste0(smp, "_pos")]
  message("Saving: ", smp_fn)
  fwrite(smp_dt, smp_fn, sep = "\t")
  # all genes
  addWorksheet(wb, sheetName = smp)
  writeData(wb, sheet = smp, smp_dt)
  # TF-TF
  tfs_dt <- smp_dt[target_TF == TRUE]
  tfs_orph <- smp_dt[!gene %in% tfs_dt$gene, .SD[1], gene]
  tfs_orph[, target_gene := "none"]
  tfs_orph[, c("target_name", "target_pfam") := ""]
  tfs_orph[, c("target_expression_tpm", "target_expression_lfc", "n_samples", "expression_correlation") := 0]
  tfs_dt <- rbindlist(list(tfs_dt, tfs_orph), use.names = TRUE)
  addWorksheet(wb, sheetName = paste0(smp, "_TF"))
  writeData(wb, sheet = paste0(smp, "_TF"), tfs_dt)
}
## Saving: results/network_Elav.tsv.gz
## Saving: results/network_Ncol.tsv.gz
## Saving: results/network_Fox.tsv.gz
saveWorkbook(
    wb,
    file.path(res_dir, "network.xlsx"),
    overwrite = TRUE
)

Cnidocytes network

Comparison with Ozment et al. 2021 eLife paper.

# load data from paper and translate gene IDs to DToL IDs
pou_act <- readWorkbook(file.path(pub_dir, "elife-74336-supp6-v3.xlsx"), startRow = 2, colNames = TRUE)
pou_rep <- readWorkbook(file.path(pub_dir, "elife-74336-supp7-v3.xlsx"), startRow = 2, colNames = TRUE)
setDT(pou_act)
setDT(pou_rep)
pou_pub <- rbindlist(list(activated = pou_act, repressed = pou_rep), idcol = "relation")
setnames(pou_pub, "gene", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_pub <- merge.data.table(pou_pub, dict, all.x = TRUE, sort = FALSE)
pou_pub <- pou_pub[, .(gene, relation, fold_change, mean_counts, pvalue)]
setnames(pou_pub, "gene", "target_gene")

# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]

# overlap of Pou4 targets
pou_pub_act <- pou_pub[relation == "activated", .(target_gene, fold_change)]
setnames(pou_pub_act, "fold_change", "activated")
pou_pub_rep <- pou_pub[relation == "repressed", .(target_gene, fold_change)]
setnames(pou_pub_rep, "fold_change", "repressed")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- merge.data.table(merge.data.table(
  pou_pub_act, pou_pub_rep,
  by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE)
pou_mt <- as.matrix(pou_dt[, -1])
pou_mt[!is.na(pou_mt)] <- 1
pou_mt[is.na(pou_mt)] <- 0

require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
  fit,
  quantities = TRUE
)
print(p1)

Inspect the differences

# reverse activation and repression sign (mutants)
pou_dt[, published := -1 * activated][is.na(published), published := -1 * repressed]

# sets of target genes
missing_in_network <- unique(pou_dt[!is.na(published) & is.na(network)]$target_gene)
missing_in_network_act <- unique(pou_dt[!is.na(activated) & is.na(network)]$target_gene)
missing_in_network_rep <- unique(pou_dt[!is.na(repressed) & is.na(network)]$target_gene)
missing_in_published <- unique(pou_dt[is.na(published)]$target_gene)
target_intersect <- unique(pou_dt[!is.na(published) & !is.na(network)]$target_gene)
mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
target_bound <- unique(mo_dt[sample=="Ncol_pos" & gene == pou4]$target_gene)

# expression for target genes
pou_expression_dt <- unique(ncol_dt[, .(target_gene, target_expression_lfc)])
# pou_expression_dt[target_gene %in% missing_in_network, annot := "published"]
pou_expression_dt[target_gene %in% missing_in_network_act, annot := "activated"]
pou_expression_dt[target_gene %in% missing_in_network_rep, annot := "repressed"]
pou_expression_dt[target_gene %in% missing_in_published, annot := "network"]
pou_expression_dt[target_gene %in% target_intersect, annot := "intersect"]
pou_expression_dt <- pou_expression_dt[!is.na(annot)]
pou_expression_dt[, bound := ifelse(target_gene %in% target_bound, "binding", "no binding")]
# plot
pou_expression_dt[, annot := factor(
  annot, levels = c("network", "intersect", "activated", "repressed")
)]
gp_pou_exp <- ggplot(pou_expression_dt, aes(annot, target_expression_lfc, fill = bound)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom() +
  geom_boxplot(outlier.color = NA, alpha = 0.5) +
  scale_fill_manual(values = c("grey40", "grey80")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "target genes", y = "expression log FC")

Genes in intersect

pou_int <- pou_net[target_gene %in% target_intersect]
pou_int <- merge.data.table(
  pou_int, pou_pub[, .(target_gene, relation)],
  by = "target_gene", all.x = TRUE, sort = FALSE
)
pou_int[, target_TF := target_gene %in% tfs_annt$gene]
# plot
gp_pou_int <- ggplot(pou_int, aes(
  expression_correlation, target_expression_lfc,
  size = 1/n_samples,
  shape = target_TF,
  fill = relation,
  label = target_name
)) +
  ggrepel::geom_text_repel(color = "black") +
  geom_point(color = "grey10") +
  scale_size_continuous(
    range = c(1, 6),
    breaks = seq(1/3, 1, length.out = 3),
    labels = seq(3, 1),
    name = "target in number of samples\n(Elav, Ncol, Fox)"
  ) +
  scale_shape_manual(
    values = c("TRUE" = 22, "FALSE" = 21),
    name = "target TF?"
  ) +
  scale_fill_manual(
    values = c("activated" = "grey90", "repressed" = "grey50"),
    name = "relation"
  ) +
  guides(
    fill = guide_legend(override.aes = list(size = 5, shape = 21)),
    shape = guide_legend(override.aes = list(size = 5)),
    size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
  ) +
  labs(
    x = "Pou4 expression correlation",
    y = "target gene expression logFC",
    title = "Intersect target genes"
  )
gp_pou_int
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps